Depression Study

cpes.raw <- as.data.frame(read_sav("data/20240-0001-Data.sav"))
if (file.exists("data/cpes.Rda")) {
  file.remove("data/cpes.Rda")
}
saveRDS(cpes.raw, file = "data/cpes.Rda")

Abstract

blablabla

Data Cleaning

Naming & Classification

cpes.raw <- readRDS("data/cpes.Rda")
cpes.cleaned <- cpes.raw %>% rename(
  "sex" = "V09036",
  "age" = "V07306",
  "region" = "V08992",
  "maritalStatus" = "V08759",
  "weightClass" = "V08823",
  "height.inches" = "V06728",
  "race" = "RANCEST",
  "householdIncome" = "V08683",
  "timesMarried" = "V09402",
  "yearsOfEducation.respondent" = "V03085",
  "parents.USBorn" = "V03125",
  "yearsOfEducation.father" = "V03081",
  "yearsOfEducation.mother" = "V03084",
  "employment.initialAge" = "V09390",
  "employment.current" = "V09154",
  
  #Adult Demographics
  "homeless.historical" = "V05633",
  "relativeWellOffRank.usPop" = "V05668",
  "relativeWellOffRank.local" = "V05669",
  "religiousAttendance.freq" = "V05629",
  "motherAlive" = "V05672",
  "motherDeath.respondentAge" = "V05674",
  "fatherAlive" = "V05686",
  "fatherDeath.respondentAge" = "V05689",
  
  #Childhood Demographics
  "highschool.numAttended" = "V05732",
  "districtHadMiddleSchool" = "V05733",
  "gradeSchool.relativeAge" = "V05741",
  "religiousImportance.childhood" = "V05713",
  "familyHeadWorkTime.male" = "V05814",
  "familyHeadWorkTime.female" = "V05820",
  "neglect.insuffSupervision" = "V05836",
  "neglect.dangerousChores" = "V05835",
  "neglect.parentSpendOnSelf" = "V05837",
  "neglect.hunger" = "V05838",
  "neglect.insuffMedical" = "V05839",
  "femaleCaretaker.closeness" = "V05841",
  "femaleCaretaker.strictRules" = "V05845",
  "femaleCaretaker.depression" = "V05846",
  "femaleCaretaker.depression.hospitalized" = "V05850",
  "femaleCaretaker.anxiety" = "V05852",
  "femaleCaretaker.employmentDifficulty" = "V05865",
  "femaleCaretaker.physicalFights" = "V05867",
  "femaleCaretaker.crime" = "V05868",
  "femaleCaretaker.suicideAttempt" = "V05871",
  "maleCaretaker.closeness" = "V05874",
  "maleCaretaker.strictRules" = "V05878",
  "maleCaretaker.depression" = "V05879",
  "maleCaretaker.depression.hospitalized" = "V05883",
  "maleCaretaker.anxiety" = "V05885",
  "maleCaretaker.employmentDifficulty" = "V05900",
  "maleCaretaker.physicalFights" = "V05902",
  "maleCaretaker.crime" = "V05903",
  "maleCaretaker.suicideAttempt" = "V05906",
  
  #Temporal
  "physicalHealth.30day" = "V04445",
  "healthWorkAbsence.30day" = "V04452",
  "difficultyConcentrating.30day" = "V04469",
  "difficultyUnderstandingSituation.30day" = "V04470",
  "difficutlyRemembering.30day" = "V04471",
  "daysMobilityDifficult.30day" = "V04474",
  "socialLifeDifficulty.30day" = "V04483",
  "daysOvertime.30day" = "V05219",
  "jobPerformance.30day" = "V05239",
  
  #Discrimination
  "lessRespect.freq" = "V06539",
  "nameCalled.freq" = "V06545",
  "dislikedForRace" = "V06550",
  
  #Employment
  "wksUnemployed.12month" = "V05162",
  "workHrs.week" = "V05204",
  
  #Family Burden
  "closeFamily.alive" = "V06280",
  "closeRelative.cancer" = "V06282",
  "closeRelative.seriousHeartProblem" = "V06290",
  "closeRelative.seriousMemoryProblem" = "V06298",
  "closeRelative.depression" = "V06344",
  "closeRelative.anxiety" = "V06352",
  
  #Family Cohesion
  "family.respectEachother" = "V06523",
  "proudOfFamily" = "V06528",
  "family.enjoyTogetherFreeTime" = "V06530",
  "family.closenessHindersPersonalGoals" = "V06533",
  "family.argueCustoms" = "V06534",
  
  #Finances
  "finances.numberPplSupportingHousehold" = "V05315",
  "afterLiquidationAndDebts.balance" = "V05312",
  "financialNeedsMet" = "V05321",
  "monthlyBillDifficulty" = "V05322",
  "insuffFoodMoney.prevYear" = "V05325",
  
  #Gambling
  "gamble.totalCount" = "V04950",
  "gamble.ageFirstExposure" = "V04962",
  
  #Marriage
  "marriage.divorceAnnulmentCount" = "V09403",
  "marriage.regretMarriage" = "V05380",
  "spouse.tooMuchAlcoholDrugs" = "V05406",
  "spouse.jobDifficulty" = "V05415",
  "spouse.financeDisagreement" = "V05373",
  "spouse.lifePhilosophyDisagreement" = "V05376",
  "spouse.majorDecisionDisagreement" = "V05377",
  "spouse.quarrel" = "V05379",
  
  #Personality
  "personality.transparentFeelings" = "V03232",
  "personality.indecisiveAbtSelfPath" = "V03240",
  "personality.feelBadWhenHurtAnother" = "V03243",
  "personality.willLieToAchieveGoal" = "V03245",
  "personality.takeRiskyChances" = "V03247",
  "personality.argueWhenStopped" = "V03252",
  "personality.letOthersMakeBigDecisions" = "V03255",
  "personality.dislikeAlone" = "V03256",
  "personality.askAdviceAbtEverydayDecisions" = "V03257",
  "personality.sociallyAwkward" = "V03261",
  "personality.preferSoloActivities" = "V03263",
  "personality.easyToGetCloseToOthers" = "V05514",
  
  #Screening
  "cigsPerDay" = "V04601",
  
  #Neighborhood
  "neighborhood.trustworthy" = "V06497",
  "neighborhood.nightSafety" = "V06501",
  
  #Social Networks
  "freqTalkWithRelatives" = "V05501",
  "freqArgueWithRelatives" = "V05505",
  "freqFriendInteraction" = "V05506",
  "freqTransparentPersonalProblems" = "V05513",
  
  ##DX: Major Depressive Disorder
  "disorder.lifetime" = "V07876",
  "disorder.12Month" = "V07655",
  "disorder.30day" = "V07657",
  "disorder.onset" = "V08766",
  "disorder.recency" = "V08768"
)

Important variables were selected and renamed according to official documentation (found in the same folder as the raw data .sav file after downloading from this site: LINK)

Additionally, categorical and continuous variables were converted to factors and doubles respectively, for compatibility with later algorithms.

Categorical variables and continuous variables are listed below:

Categorical (Refer to Appendix for Detailed Breakdown of Variable Categories):
Current Employment Status (Time of Survey)
Marital Status (Time of Survey)
Mother Alive (Time of Survey)
Father Alive (Time of Survey)
Financial Stability After Liquidating and Paying Debts (Time of Survey)
Physical Health (Past 30 Days)
Difficulty Concentrating (Past 30 Days)
Difficulty Understanding Situations (Past 30 Days)
Difficulty Remembering (Past 30 Days)
Social Life Difficulty (Past 30 Days)
Financial Needs Met (Past Year)
Monthly Bill Difficulty (Past Year)
Freq Insufficient Money for Food (Past Year)
Spouse Takes Too Much Alcohol/Drugs (Past Year)
Spouse Has Trouble Finding Job (Past Year)
Freq Talk With Relatives (Past Year)
Freq Argue With Relatives (Past Year)
Freq Interact With Friends (Past Year)
Freq Transparent With Personal Problems (Past Year)
Homeless (Past or Present)
Trustworthy Neighborhood (Past Year)
Neighborhood Safe At Night (Past Year)
Family Mutual Respect (Past Year)
Proud of Family (Past Year)
Family Enjoys Free Time Together (Past Year)
Family Closeness Obstructs Personal Goals (Past Year)
Family Argues Customs (Past Year)
Freq Less Respect vs. Peers (Past Year)
Freq Name Called (Past Year)
Freq Disliked for Race (Past Year)
Weight Class (Past Year)
Freq Attend Church (Past Year)
Freq Disagree With Spouse on Finances (Past Year)
Freq Disagree With Spouse on Life Philosophy
Freq Disagree With Spouse on Major Decisions (Past Year)
Freq Quarrel with Spouse (Past Year)
Freq Regret Marriage (Past Year)
Close Relative has Cancer (Past Year)
Close Relative has Serious Heart Problem (Past Year)
Close Relative has Serious Memory Problem (Past Year)
Close Relative has Depression (Past Year)
Close Relative has Anxiety (Past Year)
District Had Middle School (Childhood)
Relative Age During Grade School (Childhood)
Religious Importance (Childhood)
Male Head Time Spend At Work (Childhood)
Female Head Time Spend At Work (Childhood)
Freq Doing Dangerous Chores (Childhood)
Freq Insufficiently Supervised (Childhood)
Freq Parent Neglected Respondent and Spent on Self (Childhood)
Freq Hungry Due To Neglect (Childhood)
Freq Insufficient Medical Care (Childhood)
Closeness to Female Caretaker (Childhood)
Female Caretaker had Strict Rules (Childhood)
Female Caretaker had Depression (Childhood)
Female Caretaker Hospitalized for Depression (Childhood)
Female Caretaker had Anxiety (Childhood)
Female Caretaker had Difficulty Finding Job (Childhood)
Female Caretaker had Physical Fights (Childhood)
Female Caretaker Engaged in Crime (Childhood)
Female Caretaker Attempted Suicide (Childhood)
Closeness to Male Caretaker (Childhood)
Male Caretaker had Strict Rules (Childhood)
Male Caretaker had Depression (Childhood)
Male Caretaker Hospitalized for Depression (Childhood)
Male Caretaker had Anxiety (Childhood)
Male Caretaker had Difficulty Finding Job (Childhood)
Male Caretaker had Physical Fights (Childhood)
Male Caretaker Engaged in Crime (Childhood)
Male Caretaker Attempted Suicide (Childhood)
Transparent Personality
Indecisive About Own Future
Feel Bad When Harm Another
Will Lie to Achieve Goals
Take Risks
Argue When Stopped
Let Others Make Big Decisions
Dislike Isolation
Ask Advice About Everyday Decisions
Easy to Get Close to Others
Socially Awkward
Prefer Solo Activities
Race
Sex
Region (US Census)

Continuous: Age (Time of Survey)
Height (Time of Survey)
Number of People Financially Supporting Household (Time of Survey)
Health Related Work Absence (Past 30 Days)
Difficult Mobility (Past 30 Days)
Worked Overtime (Past 30 Days)
Job Performance (Past 30 Days)
Relative Well Being vs. USPop (Past Year)
Relative Well Being vs. Neighborhood (Past Year)
Work Hrs per Week (Past Year)
Household Income (Past Year)
Weeks Unemployed (Past Year)
Cigarettes per Day (Past Year)
Times Married
Number of Divorce/Annulments
Years of Education
Times Gambled
Age First Exposed to Gambling
Age at Time of Mother’s Death
Age at Time of Father’s Death
High-schools Attended
US Born Parents
Alive Close Family
Father Years of Education
Mother Years of Education
Age First Employed

Factoring and Subsetting

cpes.cleaned <- cpes.cleaned %>% select(-starts_with("V") & -starts_with("NC") & -starts_with("NS") & -starts_with("NL") & -starts_with("CPES") & -c("SESTRAT", "SECLUSTR"))

####convert appropriate variables to factor####
cpes.cleaned %<>% mutate_at(
  c("sex", "region", "maritalStatus","weightClass", "race", "employment.current", "physicalHealth.30day", "difficultyConcentrating.30day", "difficultyUnderstandingSituation.30day", "difficutlyRemembering.30day", "socialLifeDifficulty.30day", "afterLiquidationAndDebts.balance", "spouse.tooMuchAlcoholDrugs", "financialNeedsMet","monthlyBillDifficulty","insuffFoodMoney.prevYear", "spouse.jobDifficulty", "freqTalkWithRelatives", "freqArgueWithRelatives", "freqFriendInteraction", "freqTransparentPersonalProblems", "homeless.historical", "neighborhood.trustworthy", "neighborhood.nightSafety", "family.respectEachother", "proudOfFamily", "family.enjoyTogetherFreeTime", "family.closenessHindersPersonalGoals", "family.argueCustoms", "lessRespect.freq", "nameCalled.freq", "dislikedForRace", "religiousAttendance.freq", "personality.transparentFeelings", "personality.indecisiveAbtSelfPath", "personality.feelBadWhenHurtAnother", "personality.willLieToAchieveGoal", "personality.takeRiskyChances", "personality.argueWhenStopped", "personality.letOthersMakeBigDecisions", "personality.dislikeAlone", "personality.askAdviceAbtEverydayDecisions", "personality.sociallyAwkward", "personality.preferSoloActivities", "spouse.financeDisagreement", "spouse.lifePhilosophyDisagreement", "spouse.majorDecisionDisagreement", "spouse.quarrel", "marriage.regretMarriage", "personality.easyToGetCloseToOthers", "motherAlive", "fatherAlive", "districtHadMiddleSchool", "gradeSchool.relativeAge", "religiousImportance.childhood", "familyHeadWorkTime.male", "familyHeadWorkTime.female", "neglect.dangerousChores", "neglect.insuffSupervision", "neglect.parentSpendOnSelf", "neglect.hunger", "neglect.insuffMedical", "femaleCaretaker.closeness", "femaleCaretaker.strictRules", "femaleCaretaker.depression", "femaleCaretaker.depression.hospitalized", "femaleCaretaker.anxiety", "femaleCaretaker.employmentDifficulty", "femaleCaretaker.physicalFights", "femaleCaretaker.crime", "femaleCaretaker.suicideAttempt", "maleCaretaker.closeness", "maleCaretaker.strictRules", "maleCaretaker.depression", "maleCaretaker.depression.hospitalized", "maleCaretaker.anxiety", "maleCaretaker.employmentDifficulty", "maleCaretaker.physicalFights", "maleCaretaker.crime", "maleCaretaker.suicideAttempt", "closeRelative.cancer", "closeRelative.seriousHeartProblem", "closeRelative.seriousMemoryProblem", "closeRelative.depression", "closeRelative.anxiety", "disorder.30day" ,"disorder.12Month" ,"disorder.lifetime"), as.factor)

####convert appropriate variables to numeric####
cpes.cleaned %<>% mutate_at(
  c("age", "height.inches","householdIncome","timesMarried", "finances.numberPplSupportingHousehold", "wksUnemployed.12month", "marriage.divorceAnnulmentCount", "yearsOfEducation.respondent", "workHrs.week","cigsPerDay","gamble.totalCount", "gamble.ageFirstExposure", "daysOvertime.30day", "motherDeath.respondentAge", "fatherDeath.respondentAge","highschool.numAttended", "closeFamily.alive", "parents.USBorn", "yearsOfEducation.father","yearsOfEducation.mother","employment.initialAge", "jobPerformance.30day", "disorder.recency","disorder.onset", "healthWorkAbsence.30day", "daysMobilityDifficult.30day", "relativeWellOffRank.usPop", "relativeWellOffRank.local"), as.numeric)

#####cat####
cat <- cpes.cleaned[,names(cpes.cleaned)=="CASEID" | sapply(cpes.cleaned, is.factor)]
cat <- select(cat, -c("disorder.lifetime", "disorder.12Month", "disorder.30day"))
cat$employment.current <- recode_factor(cat$employment.current, '1' = "Employed", '2' = "Unemployed", '3' = "Not In Labor Force")
cat$physicalHealth.30day <- recode_factor(cat$physicalHealth.30day, '1' = "Better", '2' = "Worse", '3' = "About The Same")
cat$homeless.historical <- recode_factor(cat$homeless.historical, '1' = "Yes", '5' = "No")
cat$difficultyConcentrating.30day <- recode_factor(cat$difficultyConcentrating.30day, '1' = "None", '2' = "Mild", '3' = "Moderate", '4' = "Severe", '5' = "Impossible")
cat$difficultyUnderstandingSituation.30day <- recode_factor(cat$difficultyUnderstandingSituation.30day, '1' = "None", '2' = "Mild", '3' = "Moderate", '4' = "Severe", '5' = "Impossible")
cat$difficutlyRemembering.30day <- recode_factor(cat$difficutlyRemembering.30day, '1' = "None", '2' = "Mild", '3' = "Moderate", '4' = "Severe", '5' = "Impossible")
cat$socialLifeDifficulty.30day <- recode_factor(cat$socialLifeDifficulty.30day, '1' = "Yes", '5' = "No")
cat$afterLiquidationAndDebts.balance <- recode_factor(cat$afterLiquidationAndDebts.balance, '1' = "Money Left", '2' = "Owe Money", '3' = "Debts Equal Assets", '4' = "Don't Owe Anything")
cat$financialNeedsMet <- recode_factor(cat$financialNeedsMet, '1' = "More Than Need", '2' = "Just Enough", '3' = "Not Enough")
cat$monthlyBillDifficulty <- recode_factor(cat$monthlyBillDifficulty, '1' = "Very Difficult", '2' = "Somewhat Difficult", '3' = "Not Very Difficult", '4' = "Not At All Difficult")
cat$insuffFoodMoney.prevYear <- recode_factor(cat$insuffFoodMoney.prevYear, '1' = "Often", '2' = "Sometimes", '3' = "Rarely", '4' = "Never")
cat$spouse.tooMuchAlcoholDrugs <- recode_factor(cat$spouse.tooMuchAlcoholDrugs, '1' = "Often", '2' = "Sometimes", '3' = "Rarely", '4' = "Never")
cat$spouse.jobDifficulty <- recode_factor(cat$spouse.jobDifficulty, '1' = "Yes", '5' = "No", '7' = "Does Not Apply/Spouse Never Worked")
cat$freqTalkWithRelatives <- recode_factor(cat$freqTalkWithRelatives, '1' = "Most Every Day", '2' = "A Few Times A Week", '3' = "A Few Times A Month", '4' = "Once A Month", '5' = "Less Than Once A Month")
cat$freqArgueWithRelatives <- recode_factor(cat$freqArgueWithRelatives, '1' = "Often", '2' = "Sometimes", '3' = "Rarely", '4' = "Never")
cat$freqFriendInteraction <- recode_factor(cat$freqFriendInteraction, '1' = "Most Every Day", '2' = "A Few Times A Week", '3' = "A Few Times A Month", '4' = "Once A Month", '5' = "Less Than Once A Month")
cat$freqTransparentPersonalProblems <- recode_factor(cat$freqTransparentPersonalProblems, '1' = "Always", '2' = "Most Of The Time", '3' = "Sometimes", '4' = "Rarely", '5' = "Never")
cat$freqTransparentPersonalProblems <- recode_factor(cat$freqTransparentPersonalProblems, '1' = "Yes", '2' = "No")
cat$neighborhood.trustworthy <- recode_factor(cat$neighborhood.trustworthy, '1' = "Very True", '2' = "Somewhat True", '3' = "Not Very True", '4' = "Not At All True")
cat$neighborhood.nightSafety <- recode_factor(cat$neighborhood.nightSafety, '1' = "Very True", '2' = "Somewhat True", '3' = "Not Very True", '4' = "Not At All True")
cat$family.respectEachother <- recode_factor(cat$family.respectEachother, '1' = "Strongly Agree", '2' = "Somewhat Agree", '3' = "Somewhat Disagree", '4' = "Strongly Disagree")
cat$proudOfFamily <- recode_factor(cat$proudOfFamily, '1' = "Strongly Agree", '2' = "Somewhat Agree", '3' = "Somewhat Disagree", '4' = "Strongly Disagree")
cat$family.enjoyTogetherFreeTime <- recode_factor(cat$family.enjoyTogetherFreeTime, '1' = "Strongly Agree", '2' = "Somewhat Agree", '3' = "Somewhat Disagree", '4' = "Strongly Disagree")
cat$family.closenessHindersPersonalGoals <- recode_factor(cat$family.closenessHindersPersonalGoals, '1' = "Hardly Ever/Never", '2' = "Sometimes", '3' = "Often")
cat$family.argueCustoms <- recode_factor(cat$family.argueCustoms, '1' = "Hardly Ever/Never", '2' = "Sometimes", '3' = "Often")
cat$lessRespect.freq <- recode_factor(cat$lessRespect.freq, '1' = "Almost Every Day", '2' = "At Least Once A Week", '3' = "A Few Times A Month", '4' = "A Few Times A Year", '5' = "Less Than Once A Year", '6' = "Never")
cat$nameCalled.freq <- recode_factor(cat$nameCalled.freq, '1' = "Almost Every Day", '2' = "At Least Once A Week", '3' = "A Few Times A Month", '4' = "A Few Times A Year", '5' = "Less Than Once A Year", '6' = "Never")
cat$dislikedForRace <- recode_factor(cat$dislikedForRace, '1' = "Often", '2' = "Sometimes", '3' = "Rarely", '4' = "Never")
cat$sex <- recode_factor(cat$sex, '1' = "Male", '2' = "Female")
cat$region <- recode_factor(cat$region, '1' = "Northeast", '2' = "Midwest", '3' = "South", '4' = "West")
cat$maritalStatus <- recode_factor(cat$maritalStatus, '1' = "Married/Cohabiting", '2' = "Divorced/Separated/Widowed", '3' = "Never Married")
cat$religiousAttendance.freq <- recode_factor(cat$religiousAttendance.freq, '1' = "More Than Once A Week", '2' = "About Once A Week", '3' = "One To Three Times A Month", '4' = "Less Than Once A Month", '5' = "Never")
cat$weightClass <- recode_factor(cat$weightClass, '1' = "Underweight", '2' = "Healthy", '3' = "Overweight", '4' = "Obesity Class 1", '5' = "Obesity Class 2", '5' = "Obesity Class 3")
cat$race <- recode_factor(cat$race, '1' = "Vietnamese", '2' = "Filipino", '3' = "Chinese", '4' = "All Other Asian", '5' = "Cuban", '6' = "PuertoRican", '7' = "Mexican", '8' = "All Other Hispanic", '9' = "Afro-Carribean", '10' = "African American", '11' = "Non-Hispanic Whites", '12' = "All Other")
cat$personality.transparentFeelings <- recode_factor(cat$personality.transparentFeelings, '1' = "TRUE", '5' = "FALSE")
cat$personality.indecisiveAbtSelfPath <- recode_factor(cat$personality.indecisiveAbtSelfPath, '1' = "TRUE", '5' = "FALSE")
cat$personality.feelBadWhenHurtAnother <- recode_factor(cat$personality.feelBadWhenHurtAnother, '1' = "TRUE", '5' = "FALSE")
cat$personality.willLieToAchieveGoal <- recode_factor(cat$personality.willLieToAchieveGoal, '1' = "TRUE", '5' = "FALSE")
cat$personality.takeRiskyChances <- recode_factor(cat$personality.takeRiskyChances, '1' = "TRUE", '5' = "FALSE")
cat$personality.letOthersMakeBigDecisions <- recode_factor(cat$personality.letOthersMakeBigDecisions, '1' = "TRUE", '5' = "FALSE")
cat$personality.argueWhenStopped <- recode_factor(cat$personality.argueWhenStopped, '1' = "TRUE", '5' = "FALSE")
cat$personality.dislikeAlone <- recode_factor(cat$personality.dislikeAlone, '1' = "TRUE", '5' = "FALSE")
cat$personality.askAdviceAbtEverydayDecisions <- recode_factor(cat$personality.askAdviceAbtEverydayDecisions, '1' = "TRUE", '5' = "FALSE")
cat$personality.sociallyAwkward <- recode_factor(cat$personality.sociallyAwkward, '1' = "TRUE", '5' = "FALSE")
cat$personality.preferSoloActivities <- recode_factor(cat$personality.preferSoloActivities, '1' = "TRUE", '5' = "FALSE")
cat$spouse.financeDisagreement <- recode_factor(cat$spouse.financeDisagreement, '1' = "All Of The Time", '2' = "Most Of The Time", '3' = "Sometimes", '4' = "Rarely", '5' = "Never")
cat$spouse.lifePhilosophyDisagreement <- recode_factor(cat$spouse.lifePhilosophyDisagreement, '1' = "All Of The Time", '2' = "Most Of The Time", '3' = "Sometimes", '4' = "Rarely", '5' = "Never")
cat$spouse.quarrel <- recode_factor(cat$spouse.quarrel, '1' = "All Of The Time", '2' = "Most Of The Time", '3' = "Sometimes", '4' = "Rarely", '5' = "Never")
cat$spouse.majorDecisionDisagreement <- recode_factor(cat$spouse.majorDecisionDisagreement, '1' = "All Of The Time", '2' = "Most Of The Time", '3' = "Sometimes", '4' = "Rarely", '5' = "Never")
cat$marriage.regretMarriage <- recode_factor(cat$marriage.regretMarriage, '1' = "All Of The Time", '2' = "Most Of The Time", '3' = "Sometimes", '4' = "Rarely", '5' = "Never")
cat$personality.easyToGetCloseToOthers <- recode_factor(cat$personality.easyToGetCloseToOthers,  '1' = "Strongly Agree", '2' = "Somewhat Agree", '3' = "Somewhat Disagree", '4' = "Strongly Disagree")
cat$motherAlive <- recode_factor(cat$motherAlive, '1' = "Yes", '5' = "No")
cat$fatherAlive <- recode_factor(cat$fatherAlive, '1' = "Yes", '5' = "No")
cat$gradeSchool.relativeAge <- recode_factor(cat$gradeSchool.relativeAge, '1' = "Younger", '2' = "Older", '3' = "Average", '4' = "Varied")
cat$districtHadMiddleSchool <- recode_factor(cat$districtHadMiddleSchool, '1' = "Yes", '5' = "No")
cat$religiousImportance.childhood <- recode_factor(cat$religiousImportance.childhood, '1' = "Very Important", '2' = "Somewhat Important", '3' = "Not Very Important", '4' = "Not At All Important")
cat$familyHeadWorkTime.female <- recode_factor(cat$familyHeadWorkTime.female, '1' = "All", '2' = "Most", '3' = "Some", '4' = "A Little", '5' = "Not At All")
cat$familyHeadWorkTime.male <- recode_factor(cat$familyHeadWorkTime.male, '1' = "All", '2' = "Most", '3' = "Some", '4' = "A Little", '5' = "Not At All")
cat$neglect.insuffSupervision <- recode_factor(cat$neglect.insuffSupervision, '1' = "Often", '2' = "Sometimes", '3' = "Rarely", '4' = "Never")
cat$neglect.parentSpendOnSelf <- recode_factor(cat$neglect.parentSpendOnSelf, '1' = "Often", '2' = "Sometimes", '3' = "Rarely", '4' = "Never")
cat$neglect.dangerousChores <- recode_factor(cat$neglect.dangerousChores, '1' = "Often", '2' = "Sometimes", '3' = "Rarely", '4' = "Never")
cat$neglect.hunger <- recode_factor(cat$neglect.hunger, '1' = "Often", '2' = "Sometimes", '3' = "Rarely", '4' = "Never")
cat$neglect.insuffMedical <- recode_factor(cat$neglect.insuffMedical, '1' = "Often", '2' = "Sometimes", '3' = "Rarely", '4' = "Never")
cat$femaleCaretaker.closeness <- recode_factor(cat$femaleCaretaker.closeness, '1' = "Very Close", '2' = "Somewhat Close", '3' = "Not Very Close", '4' = "Not At All Close")
cat$femaleCaretaker.strictRules <- recode_factor(cat$femaleCaretaker.strictRules, '1' = "A Lot", '2' = "Some", '3' = "A Little", '4' = "Not At All")
cat$femaleCaretaker.depression.hospitalized <- recode_factor(cat$femaleCaretaker.depression.hospitalized, '1' = "Yes", '5' = "No")
cat$femaleCaretaker.depression <- recode_factor(cat$femaleCaretaker.depression, '1' = "Yes", '5' = "No")
cat$femaleCaretaker.anxiety <- recode_factor(cat$femaleCaretaker.anxiety, '1' = "Yes", '5' = "No")
cat$femaleCaretaker.physicalFights <- recode_factor(cat$femaleCaretaker.physicalFights, '1' = "Yes", '5' = "No")
cat$femaleCaretaker.crime <- recode_factor(cat$femaleCaretaker.crime, '1' = "Yes", '5' = "No")
cat$femaleCaretaker.employmentDifficulty <- recode_factor(cat$femaleCaretaker.employmentDifficulty, '1' = "Yes", '5' = "No")
cat$maleCaretaker.closeness <- recode_factor(cat$maleCaretaker.closeness, '1' = "Very Close", '2' = "Somewhat Close", '3' = "Not Very Close", '4' = "Not At All Close")
cat$femaleCaretaker.suicideAttempt <- recode_factor(cat$femaleCaretaker.suicideAttempt, '1' = "Yes", '5' = "No")
cat$maleCaretaker.strictRules <- recode_factor(cat$maleCaretaker.strictRules, '1' = "A Lot", '2' = "Some", '3' = "A Little", '4' = "Not At All")
cat$maleCaretaker.depression.hospitalized <- recode_factor(cat$maleCaretaker.depression.hospitalized, '1' = "Yes", '5' = "No")
cat$maleCaretaker.depression <- recode_factor(cat$maleCaretaker.depression, '1' = "Yes", '5' = "No")
cat$maleCaretaker.anxiety <- recode_factor(cat$maleCaretaker.anxiety, '1' = "Yes", '5' = "No")
cat$maleCaretaker.employmentDifficulty <- recode_factor(cat$maleCaretaker.employmentDifficulty, '1' = "Yes", '5' = "No")
cat$maleCaretaker.physicalFights <- recode_factor(cat$maleCaretaker.physicalFights, '1' = "Yes", '5' = "No")
cat$maleCaretaker.crime <- recode_factor(cat$maleCaretaker.crime, '1' = "Yes", '5' = "No")
cat$maleCaretaker.suicideAttempt <- recode_factor(cat$maleCaretaker.suicideAttempt, '1' = "Yes", '5' = "No")
cat$closeRelative.cancer <- recode_factor(cat$closeRelative.cancer, '1' = "Yes", '5' = "No")
cat$closeRelative.seriousHeartProblem <- recode_factor(cat$closeRelative.seriousHeartProblem, '1' = "Yes", '5' = "No")
cat$closeRelative.seriousMemoryProblem <- recode_factor(cat$closeRelative.seriousMemoryProblem, '1' = "Yes", '5' = "No")
cat$closeRelative.depression <- recode_factor(cat$closeRelative.depression, '1' = "Yes", '5' = "No")
cat$closeRelative.anxiety <- recode_factor(cat$closeRelative.anxiety, '1' = "Yes", '5' = "No")

####cont#####
cont <- cpes.cleaned[,names(cpes.cleaned)=="CASEID" | sapply(cpes.cleaned, is.numeric)]
cont <- select(cont, -c("disorder.onset", "disorder.recency"))

disorder <- select(cpes.cleaned, c("CASEID", "disorder.lifetime", "disorder.12Month", "disorder.30day", "disorder.onset", "disorder.recency"))
disorder$disorder.lifetime <- recode_factor(disorder$disorder.lifetime, '1' = "Endorsed", '5' = "Not Endorsed")
disorder$disorder.12Month <- recode_factor(disorder$disorder.12Month, '1' = "Endorsed", '5' = "Not Endorsed")
disorder$disorder.30day <- recode_factor(disorder$disorder.30day, '1' = "Endorsed", '5' = "Not Endorsed")

We subset our data into separate data-frames of categorical and continuous variables for ease of use. We also rename factor levels accordingly.

Extraneous Data Removal

cont$householdIncome[cont$householdIncome < 10000] <- NA
cont$parents.USBorn <- cont$parents.USBorn - rep(1, nrow(cont))
cont$employment.initialAge[cont$employment.initialAge > cont$age] <- NA
cont$fatherDeath.respondentAge[cont$fatherDeath.respondentAge > cont$age] <- NA
cont$motherDeath.respondentAge[cont$motherDeath.respondentAge > cont$age] <- NA
cont$closeFamily.alive[cont$closeFamily.alive > 25] <- NA

Extraneous or unusual data is either modified or removed.
Annual household incomes below 10,000 are removed, as there is high likelihood they are typos.
US Born Parent variable values are mutated to be one less than their original value so numeric values now align with number of US born parents.
Initial employment age, respondent age at mother’s death, and respondent age at father’s death are all removed if the value is greater than respondent’s current age. Alive close family values above 25 are removed. Respondents likely did not interpret close family as commonly accepted.

Exploratory Data Analysis

We conduct a brief EDA, looking at surface level trends in depression rate for region, sex, age, race, income, employment, and education

Region

caseID_region <- data.frame("CASEID" = cat$CASEID, "region" = cat$region)
caseID_depressed.lifetime <- data.frame("CASEID" = disorder$CASEID, "depressed.lifetime" = disorder$disorder.lifetime)
region_depressed.lifetime <- inner_join(caseID_region, caseID_depressed.lifetime, by = "CASEID")

region_Pop <- as.data.frame(table(region_depressed.lifetime[,2])) %>% 
  rename(region = Var1) %>% rename("regionPop" = Freq)
region_depressedPop.lifetime <- as.data.frame(table(region_depressed.lifetime[region_depressed.lifetime$depressed.lifetime == 'Endorsed', c(2, 3)]))[1:(length(unique(region_depressed.lifetime$region))),-2] %>% 
  rename("positive" = Freq)

region_depressed.lifetime <- cbind(region_Pop, region_depressedPop.lifetime$positive) %>% rename(positive = 'region_depressedPop.lifetime$positive') %>% mutate(rate = positive / regionPop)
region_depressed.lifetime <- region_depressed.lifetime[region_depressed.lifetime$regionPop >= 30,]

region_depressed.lifetime$region <- factor(region_depressed.lifetime$region, 
                                           levels = unique(region_depressed.lifetime$region)[order(region_depressed.lifetime$rate, decreasing = F)])
region_Pop$region <- factor(region_depressed.lifetime$region, 
                            levels = unique(region_depressed.lifetime$region)[order(region_depressed.lifetime$rate, decreasing = F)])

regionDist.eda <- plot_ly(region_Pop, x = ~region, y = ~regionPop, type = 'bar') %>% 
  layout(title = "Population by Region", xaxis = list(title = "Region"), yaxis = list(title = "Population", range = c(0, 8000)))
regionDist.eda
region_depressed.lifetime.eda <- plot_ly(region_depressed.lifetime, x = ~region, y = ~rate, type = 'bar') %>% 
  layout(title = "% Diagnosed with MDD by region (Lifetime)", xaxis = list(title = "Region"), yaxis = list(title = "% Diagnosed with MDD (Lifetime)", range = c(0,1)))
region_depressed.lifetime.eda

The South appears to be over-represented in our sample. Though, that should not skew the depression rate measures in the second graph, in which the South appears to be the least depressed at 13% vs. the Midwest’s 17%

Gender

caseID_sex <- data.frame("CASEID" = cat$CASEID, "sex" = cat$sex)
caseID_depressed.lifetime <- data.frame("CASEID" = disorder$CASEID, "depressed.lifetime" = disorder$disorder.lifetime)
sex_depressed.lifetime <- inner_join(caseID_sex, caseID_depressed.lifetime, by = "CASEID")

sex_Pop <- as.data.frame(table(sex_depressed.lifetime[,2])) %>% 
  rename(sex = Var1) %>% rename("sexPop" = Freq)
sex_depressedPop.lifetime <- as.data.frame(table(sex_depressed.lifetime[sex_depressed.lifetime$depressed.lifetime == 'Endorsed', c(2, 3)]))[1:(length(unique(sex_depressed.lifetime$sex))),-2] %>% 
  rename("positive" = Freq)

sex_depressed.lifetime <- cbind(sex_Pop, sex_depressedPop.lifetime$positive) %>% rename(positive = 'sex_depressedPop.lifetime$positive') %>% mutate(rate = positive / sexPop)
sex_depressed.lifetime <- sex_depressed.lifetime[sex_depressed.lifetime$sexPop >= 30,]

sex_depressed.lifetime$sex <- factor(sex_depressed.lifetime$sex, 
                                     levels = unique(sex_depressed.lifetime$sex)[order(sex_depressed.lifetime$rate, decreasing = F)])
sex_Pop$sex <- factor(sex_depressed.lifetime$sex, 
                      levels = unique(sex_depressed.lifetime$sex)[order(sex_depressed.lifetime$rate, decreasing = F)])

sexDist.eda <- plot_ly(sex_Pop, x = ~sex, y = ~sexPop, type = 'bar') %>% 
  layout(title = "Population by sex", xaxis = list(title = "sex"), yaxis = list(title = "Population", range = c(0, 12000)))
sexDist.eda
sex_depressed.lifetime.eda <- plot_ly(sex_depressed.lifetime, x = ~sex, y = ~rate, type = 'bar') %>% 
  layout(title = "% Diagnosed with MDD by sex (Lifetime)", xaxis = list(title = "sex"), yaxis = list(title = "% Diagnosed with MDD (Lifetime)", range = c(0,1)))
sex_depressed.lifetime.eda

Females are over-represented in our sample. They also have a higher depression rate at 16.9% compared to 10.5% for males

Age

caseID_age <- data.frame("CASEID" = cont$CASEID, "age" = cont$age)
caseID_depressed.12Month <- data.frame("CASEID" = disorder$CASEID, "depressed.12Month" = disorder$disorder.12Month)
age_depressed.12Month <- inner_join(caseID_age, caseID_depressed.12Month, by = "CASEID")

age_Pop <- as.data.frame(table(age_depressed.12Month[,2])) %>% 
  rename(age = Var1) %>% rename("agePop" = Freq)
age_depressedPop.12Month <- as.data.frame(table(age_depressed.12Month[age_depressed.12Month$depressed.12Month == 'Endorsed', c(2, 3)]))[1:(length(unique(age_depressed.12Month$age))),-2] %>% 
  rename("positive" = Freq)

age_depressed.12Month <- cbind(age_Pop, age_depressedPop.12Month$positive) %>% rename(positive = 'age_depressedPop.12Month$positive') %>% mutate(rate = positive / agePop)
age_depressed.12Month <- age_depressed.12Month[age_depressed.12Month$agePop >= 30,]

ageDist.eda <- plot_ly(age_Pop, x = ~age, y = ~agePop, type = 'bar') %>% 
  layout(title = "Population by age", xaxis = list(title = "age"), yaxis = list(title = "Population", range = c(0, 600)))
ageDist.eda
age_depressed.12Month.eda <- plot_ly(age_depressed.12Month, x = ~age, y = ~rate, type = 'scatter', mode = 'lines') %>% 
  layout(title = "% Positive for MDD by age (12 Month)", xaxis = list(title = "age"), yaxis = list(title = "% Positive for MDD (12 Month)", range = c(0,0.5)))
age_depressed.12Month.eda

The age distribution matches that of the United States. For the graph of depression rates, ages from 87 to 99 were excluded, as they had less than 30 observations. Depression rate appears to generally trend downward as age increases.

Race

caseID_race <- data.frame("CASEID" = cat$CASEID, "race" = cat$race)
caseID_depressed.lifetime <- data.frame("CASEID" = disorder$CASEID, "depressed.lifetime" = disorder$disorder.lifetime)
race_depressed.lifetime <- inner_join(caseID_race, caseID_depressed.lifetime, by = "CASEID")

race_Pop <- as.data.frame(table(race_depressed.lifetime[,2])) %>% 
  rename(race = Var1) %>% rename("racePop" = Freq)
race_depressedPop.lifetime <- as.data.frame(table(race_depressed.lifetime[race_depressed.lifetime$depressed.lifetime == 'Endorsed', c(2, 3)]))[1:(length(unique(race_depressed.lifetime$race))),-2] %>% 
  rename("positive" = Freq)

race_depressed.lifetime <- cbind(race_Pop, race_depressedPop.lifetime$positive) %>% rename(positive = 'race_depressedPop.lifetime$positive') %>% mutate(rate = positive / racePop)
race_depressed.lifetime <- race_depressed.lifetime[race_depressed.lifetime$racePop >= 30,]

race_depressed.lifetime$race <- factor(race_depressed.lifetime$race, 
                                       levels = unique(race_depressed.lifetime$race)[order(race_depressed.lifetime$rate, decreasing = F)])
race_Pop$race <- factor(race_depressed.lifetime$race, 
                        levels = unique(race_depressed.lifetime$race)[order(race_depressed.lifetime$rate, decreasing = F)])

raceDist.eda <- plot_ly(race_Pop, x = ~race, y = ~racePop, type = 'bar') %>% 
  layout(title = "Population by Race", xaxis = list(title = "Race"), yaxis = list(title = "Population", range = c(0, 8000)))
raceDist.eda
race_depressed.lifetime.eda <- plot_ly(race_depressed.lifetime, x = ~race, y = ~rate, type = 'bar') %>% 
  layout(title = "% Diagnosed with MDD by Race", xaxis = list(title = "Race"), yaxis = list(title = "% Diagnosed with MDD", range = c(0,1)))
race_depressed.lifetime.eda

The race distribution generally matches that of the United States. Depression rate appears to be the highest for “All Other” Minorities, Non-Hispanic Whites, and Puerto Ricans.

Household Income

caseID_householdIncome <- data.frame("CASEID" = cont$CASEID, "householdIncome" = cont$householdIncome)
caseID_depressed.12Month <- data.frame("CASEID" = disorder$CASEID, "depressed.12Month" = disorder$disorder.12Month)
householdIncome_depressed.12Month <- inner_join(caseID_householdIncome, caseID_depressed.12Month, by = "CASEID")
householdIncome_depressed.12Month <- mutate(householdIncome_depressed.12Month, householdIncome = round_any(householdIncome, 5000, f = floor))

householdIncome_Pop <- as.data.frame(table(householdIncome_depressed.12Month[,2])) %>% 
  rename(householdIncome = Var1) %>% 
  rename(householdIncomePop = Freq)
householdIncome_depressedPop.12Month <- as.data.frame(table(householdIncome_depressed.12Month[householdIncome_depressed.12Month$depressed.12Month == 'Endorsed', c(2, 3)]))[1:(length(unique(householdIncome_depressed.12Month$householdIncome))),-2] %>% 
  rename("positive" = Freq)

householdIncome_depressed.12Month <- merge(householdIncome_Pop, householdIncome_depressedPop.12Month, by = "householdIncome") %>% mutate(rate = positive / householdIncomePop)
householdIncome_depressed.12Month <- householdIncome_depressed.12Month[householdIncome_depressed.12Month$householdIncomePop >= 30,]

householdIncome_depressed.12Month <- householdIncome_depressed.12Month[householdIncome_depressed.12Month$positive != 0,]
householdIncome_depressed.12Month <- householdIncome_depressed.12Month[order(householdIncome_depressed.12Month$householdIncome),]

householdIncomeDist.eda <- plot_ly(householdIncome_Pop, x = ~householdIncome, y = ~householdIncomePop, type = 'bar') %>% 
  layout(title = "Population by Annual Household Income (To Nearest 5k)", xaxis = list(title = "Population by Annual Household Income (Rounded to Nearest 5k)"), yaxis = list(title = "Population", range = c(0, 1500)))
householdIncomeDist.eda
householdIncome_depressed.12Month.eda <- plot_ly(householdIncome_depressed.12Month, x = ~householdIncome, y = ~rate, type = 'scatter', mode ='lines') %>% 
  layout(title = "% Positive for MDD by Household Income", xaxis = list(title = "Annual Household Income (To Nearest 5k)"), yaxis = list(title = "% Positive for MDD", range = c(0,0.5)))
householdIncome_depressed.12Month.eda

Income distribution appears to generally follow that of the United States, though our model has income capped at 200,000. Incomes have also been rounded to the nearest floor by increments of 5000, and observations of less than 30 have been omitted. Depression rate generally decreases from 10k to 100k, increases from 100k to 145k, drops at 150k, and remains steady afterward, with perhaps a slight increase.

Employment

caseID_employment.current <- data.frame("CASEID" = cat$CASEID, "employment.current" = cat$employment.current)
caseID_depressed.12Month <- data.frame("CASEID" = disorder$CASEID, "depressed.12Month" = disorder$disorder.12Month)
employment.current_depressed.12Month <- inner_join(caseID_employment.current, caseID_depressed.12Month, by = "CASEID")

employment.current_Pop <- as.data.frame(table(employment.current_depressed.12Month[,2])) %>% 
  rename(employment.current = Var1) %>% rename("employment.currentPop" = Freq)
employment.current_depressedPop.12Month <- as.data.frame(table(employment.current_depressed.12Month[employment.current_depressed.12Month$depressed.12Month == 'Endorsed', c(2, 3)]))[1:(length(unique(employment.current_depressed.12Month$employment.current))),-2] %>% 
  rename("positive" = Freq)

employment.current_depressed.12Month <- cbind(employment.current_Pop, employment.current_depressedPop.12Month[-4,2]) %>% rename(positive = 'employment.current_depressedPop.12Month[-4, 2]') %>% mutate(rate = positive / employment.currentPop)
employment.current_depressed.12Month <- employment.current_depressed.12Month[employment.current_depressed.12Month$employment.currentPop >= 30,]

employment.current_depressed.12Month$employment.current <- factor(employment.current_depressed.12Month$employment.current, 
                                                                  levels = unique(employment.current_depressed.12Month$employment.current)[order(employment.current_depressed.12Month$rate, decreasing = F)])
employment.current_Pop$employment.current <- factor(employment.current_depressed.12Month$employment.current, 
                                                    levels = unique(employment.current_depressed.12Month$employment.current)[order(employment.current_depressed.12Month$rate, decreasing = F)])

employment.currentDist.eda <- plot_ly(employment.current_Pop, x = ~employment.current, y = ~employment.currentPop, type = 'bar') %>% 
  layout(title = "Population by Employment Status", xaxis = list(title = "Employment Status"), yaxis = list(title = "Population", range = c(0, 14000)))
employment.currentDist.eda
employment.current_depressed.12Month.eda <- plot_ly(employment.current_depressed.12Month, x = ~employment.current, y = ~rate, type = 'bar') %>% 
  layout(title = "% Positive for MDD by Employment Status", xaxis = list(title = "Employment Status"), yaxis = list(title = "% Positive with MDD", range = c(0,0.5)))
employment.current_depressed.12Month.eda
Employment distribution generally resembles that of the United States, save for a slightly higher than expected rate of respondents not in the labor force. Employment appears to lower depression rates, as “Not in Labor Force” and “Unemployed” groups have the highest depression rates.

Education

contDupli <- cont[cont$age >= 23,]

caseID_yearsOfEducation.respondent <- data.frame("CASEID" = contDupli$CASEID, "yearsOfEducation.respondent" = contDupli$yearsOfEducation.respondent)
caseID_depressed.lifetime <- data.frame("CASEID" = disorder$CASEID, "depressed.lifetime" = disorder$disorder.lifetime)
yearsOfEducation.respondent_depressed.lifetime <- inner_join(caseID_yearsOfEducation.respondent, caseID_depressed.lifetime, by = "CASEID")

yearsOfEducation.respondent_Pop <- as.data.frame(table(yearsOfEducation.respondent_depressed.lifetime[,2])) %>% 
  rename(yearsOfEducation.respondent = Var1) %>% rename("yearsOfEducation.respondentPop" = Freq)
yearsOfEducation.respondent_depressedPop.lifetime <- as.data.frame(table(yearsOfEducation.respondent_depressed.lifetime[yearsOfEducation.respondent_depressed.lifetime$depressed.lifetime == 'Endorsed', c(2, 3)]))[1:(length(unique(yearsOfEducation.respondent_depressed.lifetime$yearsOfEducation.respondent))),-2] %>% 
  rename("positive" = Freq)

yearsOfEducation.respondent_depressed.lifetime <- cbind(yearsOfEducation.respondent_Pop, yearsOfEducation.respondent_depressedPop.lifetime[-15,2]) %>% rename(positive = 'yearsOfEducation.respondent_depressedPop.lifetime[-15, 2]') %>% mutate(rate = positive / yearsOfEducation.respondentPop)
yearsOfEducation.respondent_depressed.lifetime <- yearsOfEducation.respondent_depressed.lifetime[yearsOfEducation.respondent_depressed.lifetime$yearsOfEducation.respondentPop >= 30,]

yearsOfEducation.respondentDist.eda <- plot_ly(yearsOfEducation.respondent_Pop, x = ~yearsOfEducation.respondent, y = ~yearsOfEducation.respondentPop, type = 'bar') %>% 
  layout(title = "Population by Years of Education", xaxis = list(title = "Years of Education"), yaxis = list(title = "Population", range = c(0, 6000)))
yearsOfEducation.respondentDist.eda
yearsOfEducation.respondent_depressed.lifetime.eda <- plot_ly(yearsOfEducation.respondent_depressed.lifetime, x = ~yearsOfEducation.respondent, y = ~rate, type = 'scatter', mode = 'lines') %>% 
  layout(title = "% Diagnosed with MDD by Years of Education (Lifetime)", xaxis = list(title = "Years of Education"), yaxis = list(title = "% Diagnosed with MDD (Lifetime)", range = c(0,0.5)))
yearsOfEducation.respondent_depressed.lifetime.eda

The education distribution is as expected, and representative of the United States, with an expected peak at 12 years. To plot depression rates, respondents of less than 23 years of age were excluded for the purpose of modeling lifetime risk of depression. Save for a peak at 4 years of education, depression rates generally trend upwards as people get more educated, until it plateaus at around 13.

Pairwise

pairs.panels(cont[2:27])

#TODO: Exclude depression variables

Pairwise plot of all continuous variables. High level assessment for multicollinearity, and potential interesting relationships between variables.

Assessing 30 day Risk Of Testing Positive for MDD

All predictive modeling techniques below will use the same testing set (NAs omitted) to determine MCE, Sensitivity, and Specificity. Training sets will be modified accordingly as not all models natively handle NAs. Via random sampling, all training sets are balanced to ensure an about equal distribution of endorsed and not endorsed observations. This is to prevent the model learning to predict consistently “not endorsed” without proper consideration of feature values, as our data is heavily skewed in that manner. The base training and testing set however, is made of a 65:20 split of all observations where 30 day depression endorsement is valid. The remaining 15 are allocated to a validation set for final model comparison.

Variables of interest relevant to a 30 day prediction are: physicalHealth socialLifeDifficulty, healthWorkAbsence, difficultyRemembering, daysMobilityDifficult, daysOvertime, difficultyConcentrating, difficultyUnderstandingSituation, and jobPerformance.

Bolded variables have a missingness of less than 60%

#Create data: dataframe of all relevant vars to whether or not one has disorder in 30 day timeframe
caseID_disorder.30day <- disorder[,c(1, 4)]
relevantVarsCont <- select(cont, c("healthWorkAbsence.30day", "daysMobilityDifficult.30day", "daysOvertime.30day", "jobPerformance.30day", "CASEID"))
relevantVarsCat <- select(cat, c("physicalHealth.30day", "difficultyConcentrating.30day", "difficultyUnderstandingSituation.30day", "socialLifeDifficulty.30day", "difficutlyRemembering.30day", "CASEID"))
relevantVars <- merge(relevantVarsCat, relevantVarsCont, by = "CASEID")
data <- merge(relevantVars, caseID_disorder.30day, by = "CASEID")
data <- data[is.na(data$disorder.30day) == F,-1]
levels(data$disorder.30day) <- list('1' = "Endorsed", '0' = "Not Endorsed")

#Save unimputed data
if (file.exists("data/30dayData_unimputed.Rda")) {
  file.remove("data/30dayData_unimputed.Rda")
}
saveRDS(data, file = "data/30dayData_unimputed.Rda")

#Create and save imputed data
set.seed(100)
data.imputed <- rfImpute(disorder.30day~., data = data)
if (file.exists("data/30dayData_imputed.Rda")) {
  file.remove("data/30dayData_imputed.Rda")
}
saveRDS(data.imputed, file = "data/30dayData_imputed.Rda")

#Create ind: index used to create training and test sets
set.seed(101)
ind <- sample(3, nrow(data), replace = T, prob = c(0.65, 0.2, 0.15))

#Prepare unimputed sets
train <- data[ind == 1,]
if (file.exists("data/30dayTrain_unimputed.Rda")) {
  file.remove("data/30dayTrain_unimputed.Rda")
}
saveRDS(train, file = "data/30dayTrain_unimputed.Rda")
testOne <- data[ind == 2,]
if (file.exists("data/30dayTestOne_unimputed.Rda")) {
  file.remove("data/30dayTestOne_unimputed.Rda")
}
saveRDS(testOne, file = "data/30dayTestOne_unimputed.Rda")
testTwo <- data[ind == 3,]
if (file.exists("data/30dayTestTwo_unimputed.Rda")) {
  file.remove("data/30dayTestTwo_unimputed.Rda")
}
saveRDS(testTwo, file = "data/30dayTestTwo_unimputed.Rda")

#Prepare imputed sets
train <- data.imputed[ind == 1,]
if (file.exists("data/30dayTrain_imputed.Rda")) {
  file.remove("data/30dayTrain_imputed.Rda")
}
saveRDS(train, file = "data/30dayTrain_imputed.Rda")

testOne <- data.imputed[ind == 2,]
if (file.exists("data/30dayTestOne_imputed.Rda")) {
  file.remove("data/30dayTestOne_imputed.Rda")
}
saveRDS(testOne, file = "data/30dayTestOne_imputed.Rda")

testTwo <- data.imputed[ind == 3,]
if (file.exists("data/30dayTestTwo_imputed.Rda")) {
  file.remove("data/30dayTestTwo_imputed.Rda")
}
saveRDS(testTwo, file = "data/30dayTestTwo_imputed.Rda")

Null Model

Assume MCE = 50%, Sensitivity: 50%, Specificity: 50%

Single Tree

# Imputed (Initial)
# Accuracy : 0.772         
# Sensitivity : 0.6029        
# Specificity : 0.7758  

# Imputed (First Iteration)
# Accuracy : 0.78         
# Sensitivity : 0.588        
# Specificity : 0.784  

# Imputed (Second Iteration)
# Accuracy : 0.83         
# Sensitivity : 0.573        
# Specificity : 0.835

# Imputed (Third Iteration)
# Accuracy : 0.828         
# Sensitivity : 0.6029        
# Specificity : 0.8328

# Imputed (Fourth Iteration)
# Accuracy : 0.828         
# Sensitivity : 0.6029        
# Specificity : 0.8328

# Imputed (Fifth Iteration) OPTIMAL
# Accuracy : 0.828         
# Sensitivity : 0.6029        
# Specificity : 0.8331

# Imputed (Sixth Iteration)
# Accuracy : 0.834        
# Sensitivity : 0.5882       
# Specificity : 0.8395

# Imputed (Seventh Iteration)
# Accuracy : 0.892    
# Sensitivity : 0.4706       
# Specificity : 0.9015

# Imputed (Eighth Iteration)
# Accuracy : 0.913
# Sensitivity : 0.35294       
# Specificity : 0.92527
train <- readRDS("data/30dayTrain_imputed.Rda")

# Unimputed (initial)
# Accuracy : 0.72          
# Sensitivity : 0.7353        
# Specificity : 0.7198  
# train <- readRDS("data/30dayTrain_unimputed.Rda")

train.endorsed <- train[train$disorder.30day == "1",]
train.notEndorsed <- train[train$disorder.30day == "0",]
prob <- nrow(train.endorsed)/nrow(train.notEndorsed)
set.seed(99)
ind <- sample(2, nrow(train.notEndorsed), replace = T, prob = c(prob, 1-prob))
train.notEndorsed <- train.notEndorsed[ind == 1,]
train.balanced <- rbind(train.notEndorsed, train.endorsed)

testOne <- readRDS("data/30dayTestOne_unimputed.Rda")

tree.fit <- rpart(disorder.30day~., data = train.balanced, method = "class")
tree.fit.pred <- predict(tree.fit, testOne, type = "class")
confusionMatrix(tree.fit.pred, testOne$disorder.30day)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1   41  669
##          0   27 2315
##                                         
##                Accuracy : 0.772         
##                  95% CI : (0.757, 0.787)
##     No Information Rate : 0.978         
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.067         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.6029        
##             Specificity : 0.7758        
##          Pos Pred Value : 0.0577        
##          Neg Pred Value : 0.9885        
##              Prevalence : 0.0223        
##          Detection Rate : 0.0134        
##    Detection Prevalence : 0.2326        
##       Balanced Accuracy : 0.6894        
##                                         
##        'Positive' Class : 1             
## 
tree.fit$variable.importance[order(tree.fit$variable.importance)]
##                   jobPerformance.30day            daysMobilityDifficult.30day 
##                                   6.73                                   7.38 
##                     daysOvertime.30day                   physicalHealth.30day 
##                                   9.71                                  10.39 
##            difficutlyRemembering.30day          difficultyConcentrating.30day 
##                                  13.62                                  13.74 
## difficultyUnderstandingSituation.30day                healthWorkAbsence.30day 
##                                  23.26                                  24.50 
##             socialLifeDifficulty.30day 
##                                  28.75
train.balanced.iter1 <- select(train.balanced, -c("jobPerformance.30day"))
tree.fit.iter1 <- rpart(disorder.30day~., data = train.balanced.iter1, method = "class")
tree.fit.iter1.pred <- predict(tree.fit.iter1, testOne, type = "class")
confusionMatrix(tree.fit.iter1.pred, testOne$disorder.30day)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1   40  644
##          0   28 2340
##                                         
##                Accuracy : 0.78          
##                  95% CI : (0.765, 0.794)
##     No Information Rate : 0.978         
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.069         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.5882        
##             Specificity : 0.7842        
##          Pos Pred Value : 0.0585        
##          Neg Pred Value : 0.9882        
##              Prevalence : 0.0223        
##          Detection Rate : 0.0131        
##    Detection Prevalence : 0.2241        
##       Balanced Accuracy : 0.6862        
##                                         
##        'Positive' Class : 1             
## 
tree.fit.iter1$variable.importance[order(tree.fit.iter1$variable.importance)]
##            daysMobilityDifficult.30day                   physicalHealth.30day 
##                                   5.72                                   9.71 
##                     daysOvertime.30day            difficutlyRemembering.30day 
##                                   9.82                                  12.98 
##          difficultyConcentrating.30day difficultyUnderstandingSituation.30day 
##                                  13.11                                  22.80 
##                healthWorkAbsence.30day             socialLifeDifficulty.30day 
##                                  24.50                                  28.75
train.balanced.iter2 <- select(train.balanced, -c("jobPerformance.30day", "daysMobilityDifficult.30day"))
tree.fit.iter2 <- rpart(disorder.30day~., data = train.balanced.iter2, method = "class")
tree.fit.iter2.pred <- predict(tree.fit.iter2, testOne, type = "class")
confusionMatrix(tree.fit.iter2.pred, testOne$disorder.30day)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1   39  490
##          0   29 2494
##                                         
##                Accuracy : 0.83          
##                  95% CI : (0.816, 0.843)
##     No Information Rate : 0.978         
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.095         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.5735        
##             Specificity : 0.8358        
##          Pos Pred Value : 0.0737        
##          Neg Pred Value : 0.9885        
##              Prevalence : 0.0223        
##          Detection Rate : 0.0128        
##    Detection Prevalence : 0.1733        
##       Balanced Accuracy : 0.7047        
##                                         
##        'Positive' Class : 1             
## 
tree.fit.iter2$variable.importance[order(tree.fit.iter2$variable.importance)]
##                     daysOvertime.30day                   physicalHealth.30day 
##                                   1.18                                   2.96 
##          difficultyConcentrating.30day            difficutlyRemembering.30day 
##                                   5.94                                   9.07 
##                healthWorkAbsence.30day difficultyUnderstandingSituation.30day 
##                                  11.06                                  21.42 
##             socialLifeDifficulty.30day 
##                                  28.75
train.balanced.iter3 <- select(train.balanced, -c("jobPerformance.30day", "daysMobilityDifficult.30day", "daysOvertime.30day"))
tree.fit.iter3 <- rpart(disorder.30day~., data = train.balanced.iter3, method = "class")
tree.fit.iter3.pred <- predict(tree.fit.iter3, testOne, type = "class")
confusionMatrix(tree.fit.iter3.pred, testOne$disorder.30day)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1   41  499
##          0   27 2485
##                                         
##                Accuracy : 0.828         
##                  95% CI : (0.814, 0.841)
##     No Information Rate : 0.978         
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.099         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.6029        
##             Specificity : 0.8328        
##          Pos Pred Value : 0.0759        
##          Neg Pred Value : 0.9893        
##              Prevalence : 0.0223        
##          Detection Rate : 0.0134        
##    Detection Prevalence : 0.1769        
##       Balanced Accuracy : 0.7179        
##                                         
##        'Positive' Class : 1             
## 
tree.fit.iter3$variable.importance[order(tree.fit.iter3$variable.importance)]
##                   physicalHealth.30day          difficultyConcentrating.30day 
##                                   2.96                                   5.94 
##            difficutlyRemembering.30day                healthWorkAbsence.30day 
##                                   9.07                                  11.06 
## difficultyUnderstandingSituation.30day             socialLifeDifficulty.30day 
##                                  21.42                                  28.75
train.balanced.iter4 <- select(train.balanced, -c("daysOvertime.30day", "jobPerformance.30day", "daysMobilityDifficult.30day", "physicalHealth.30day"))
tree.fit.iter4 <- rpart(disorder.30day~., data = train.balanced.iter4, method = "class")
tree.fit.iter4.pred <- predict(tree.fit.iter4, testOne, type = "class")
confusionMatrix(tree.fit.iter4.pred, testOne$disorder.30day)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1   41  499
##          0   27 2485
##                                         
##                Accuracy : 0.828         
##                  95% CI : (0.814, 0.841)
##     No Information Rate : 0.978         
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.099         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.6029        
##             Specificity : 0.8328        
##          Pos Pred Value : 0.0759        
##          Neg Pred Value : 0.9893        
##              Prevalence : 0.0223        
##          Detection Rate : 0.0134        
##    Detection Prevalence : 0.1769        
##       Balanced Accuracy : 0.7179        
##                                         
##        'Positive' Class : 1             
## 
tree.fit.iter4$variable.importance[order(tree.fit.iter4$variable.importance)]
##          difficultyConcentrating.30day            difficutlyRemembering.30day 
##                                   5.94                                   9.07 
##                healthWorkAbsence.30day difficultyUnderstandingSituation.30day 
##                                  11.06                                  21.42 
##             socialLifeDifficulty.30day 
##                                  28.75
train.balanced.iter5 <- select(train.balanced, -c("daysOvertime.30day", "jobPerformance.30day", "daysMobilityDifficult.30day", "physicalHealth.30day", "difficultyConcentrating.30day"))
tree.fit.iter5 <- rpart(disorder.30day~., data = train.balanced.iter5, method = "class")
tree.fit.iter5.pred <- predict(tree.fit.iter5, testOne, type = "class")
confusionMatrix(tree.fit.iter5.pred, testOne$disorder.30day)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1   41  498
##          0   27 2486
##                                         
##                Accuracy : 0.828         
##                  95% CI : (0.814, 0.841)
##     No Information Rate : 0.978         
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.099         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.6029        
##             Specificity : 0.8331        
##          Pos Pred Value : 0.0761        
##          Neg Pred Value : 0.9893        
##              Prevalence : 0.0223        
##          Detection Rate : 0.0134        
##    Detection Prevalence : 0.1766        
##       Balanced Accuracy : 0.7180        
##                                         
##        'Positive' Class : 1             
## 
tree.fit.iter5$variable.importance[order(tree.fit.iter5$variable.importance)]
##            difficutlyRemembering.30day                healthWorkAbsence.30day 
##                                   9.07                                  11.06 
## difficultyUnderstandingSituation.30day             socialLifeDifficulty.30day 
##                                  21.42                                  28.75
train.balanced.iter6 <- select(train.balanced, -c("daysOvertime.30day", "jobPerformance.30day", "daysMobilityDifficult.30day", "physicalHealth.30day", "difficultyConcentrating.30day", "difficutlyRemembering.30day"))
tree.fit.iter6 <- rpart(disorder.30day~., data = train.balanced.iter6, method = "class")
tree.fit.iter6.pred <- predict(tree.fit.iter6, testOne, type = "class")
confusionMatrix(tree.fit.iter6.pred, testOne$disorder.30day)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1   40  479
##          0   28 2505
##                                        
##                Accuracy : 0.834        
##                  95% CI : (0.82, 0.847)
##     No Information Rate : 0.978        
##     P-Value [Acc > NIR] : 1            
##                                        
##                   Kappa : 0.101        
##                                        
##  Mcnemar's Test P-Value : <2e-16       
##                                        
##             Sensitivity : 0.5882       
##             Specificity : 0.8395       
##          Pos Pred Value : 0.0771       
##          Neg Pred Value : 0.9889       
##              Prevalence : 0.0223       
##          Detection Rate : 0.0131       
##    Detection Prevalence : 0.1701       
##       Balanced Accuracy : 0.7139       
##                                        
##        'Positive' Class : 1            
## 
tree.fit.iter6$variable.importance[order(tree.fit.iter6$variable.importance)]
##                healthWorkAbsence.30day difficultyUnderstandingSituation.30day 
##                                   14.6                                   21.2 
##             socialLifeDifficulty.30day 
##                                   28.8
train.balanced.iter7 <- select(train.balanced, -c("daysOvertime.30day", "jobPerformance.30day", "daysMobilityDifficult.30day", "physicalHealth.30day", "difficultyConcentrating.30day", "difficutlyRemembering.30day", "healthWorkAbsence.30day"))
tree.fit.iter7 <- rpart(disorder.30day~., data = train.balanced.iter7, method = "class")
tree.fit.iter7.pred <- predict(tree.fit.iter7, testOne, type = "class")
confusionMatrix(tree.fit.iter7.pred, testOne$disorder.30day)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1   32  294
##          0   36 2690
##                                        
##                Accuracy : 0.892        
##                  95% CI : (0.88, 0.903)
##     No Information Rate : 0.978        
##     P-Value [Acc > NIR] : 1            
##                                        
##                   Kappa : 0.13         
##                                        
##  Mcnemar's Test P-Value : <2e-16       
##                                        
##             Sensitivity : 0.4706       
##             Specificity : 0.9015       
##          Pos Pred Value : 0.0982       
##          Neg Pred Value : 0.9868       
##              Prevalence : 0.0223       
##          Detection Rate : 0.0105       
##    Detection Prevalence : 0.1068       
##       Balanced Accuracy : 0.6860       
##                                        
##        'Positive' Class : 1            
## 
tree.fit.iter7$variable.importance[order(tree.fit.iter7$variable.importance)]
## difficultyUnderstandingSituation.30day             socialLifeDifficulty.30day 
##                                   21.2                                   28.8
train.balanced.iter8 <- select(train.balanced, -c("daysOvertime.30day", "jobPerformance.30day", "daysMobilityDifficult.30day", "physicalHealth.30day", "difficultyConcentrating.30day", "difficutlyRemembering.30day", "healthWorkAbsence.30day", "difficultyUnderstandingSituation.30day"))
tree.fit.iter8 <- rpart(disorder.30day~., data = train.balanced.iter8, method = "class")
tree.fit.iter8.pred <- predict(tree.fit.iter8, testOne, type = "class")
confusionMatrix(tree.fit.iter8.pred, testOne$disorder.30day)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1   24  223
##          0   44 2761
##                                         
##                Accuracy : 0.913         
##                  95% CI : (0.902, 0.922)
##     No Information Rate : 0.978         
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.122         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.35294       
##             Specificity : 0.92527       
##          Pos Pred Value : 0.09717       
##          Neg Pred Value : 0.98431       
##              Prevalence : 0.02228       
##          Detection Rate : 0.00786       
##    Detection Prevalence : 0.08093       
##       Balanced Accuracy : 0.63910       
##                                         
##        'Positive' Class : 1             
## 
tree.fit.iter8$variable.importance[order(tree.fit.iter8$variable.importance)]
## socialLifeDifficulty.30day 
##                       28.8
tree.fit.iter5$variable.importance
##             socialLifeDifficulty.30day difficultyUnderstandingSituation.30day 
##                                  28.75                                  21.42 
##                healthWorkAbsence.30day            difficutlyRemembering.30day 
##                                  11.06                                   9.07
rpart.plot(tree.fit.iter5)

We identify our optimal tree as the one with the lowest overall MCE while maintaining a high Sensitivity and Specificity. Trees were trained on two versions of the training set, imputed and unimputed. Missing values in the imputed training set were imputed using rfImpute from the randomForest package. Our best tree was iteration 5 of the imputed training set, with MCE = 0.172, Sensitivity = 0.6029, Specificity = 0.8331, and important variables socialLifeDifficulty, difficultyUnderstandingSituation, healthWorkAbsence, and difficultyRemembering (ordered from most to least important)

Random Forest

Forests were grown from preimputed (rfImpute performed on entire dataset of interest variables, training set was then selected from imputed dataset), postimputed (rfImpute performed on training set selected from dataset still with NAs), and unimputed training sets (randomForest cannot handle NAs, so NAs were omitted for unimputed training sets). Additionally, the procedures were repeated for two sets of variables: “allVars” and “goodVars”. “allVars” contains all 9 variables of interest, while “goodVars” contains only the ones with missingness below 60%.

Once more, we identify our optimal forest as the one with the lowest overall MCE while maintaining a high Sensitivity and Specificity.

#Best True Positive Rate, Horrible Overall Accuracy/MCE

train <- readRDS("data/30dayTrain_imputed.Rda")

testOne <- na.omit(readRDS("data/30dayTestOne_unimputed.Rda"))

# Optimal mtry: 6
# OOB Error: 32.8%
# Sensitivity : 0.833         
# Specificity : 0.25 
# Accuracy : 0.326

#Postimputed Best Balance

train <- readRDS("data/30dayTrain_unimputed.Rda")
#Remove all rows with more "num" NAs. Set to 0 to do unimputed training
num <- 3
train <- train[(rowSums(is.na(train)) <= num),]
set.seed(102)
if(num != 0){
  train <- rfImpute(disorder.30day~., data = train) #Impute missing values
}
## ntree      OOB      1      2
##   300:   9.54% 98.06%  0.86%
## ntree      OOB      1      2
##   300:   9.11% 99.03%  0.29%
## ntree      OOB      1      2
##   300:   9.37% 99.03%  0.57%
## ntree      OOB      1      2
##   300:   9.02% 98.06%  0.29%
## ntree      OOB      1      2
##   300:   9.11%100.00%  0.19%
testOne <- na.omit(readRDS("data/30dayTestOne_unimputed.Rda"))

# Optimal num: 3
# Optimal mtry: 4
# OOB estimate of  error rate: 44.4%
# Accuracy: 0.63 
# Sensitivity: 0.667  
# Specificity: 0.625 

#Unimputed results in balanced training set of size 15

#Comparable to “Best Balance”. Better True Negative, Worse True Positive. Tested on larger test set.

train <- readRDS("data/30dayTrain_imputed.Rda")
train <- select(train, c("physicalHealth.30day", "socialLifeDifficulty.30day", "healthWorkAbsence.30day", "disorder.30day"))

testOne <- readRDS("data/30dayTestOne_unimputed.Rda")
testOne <- na.omit(select(testOne, c("physicalHealth.30day", "socialLifeDifficulty.30day", "healthWorkAbsence.30day", "disorder.30day")))

# Optimal mtry: 2
# OOB estimate of  error rate: 30.9%
# Accuracy: 0.689  
# Sensitivity: 0.55  
# Specificity: 0.694

#Best True Negative Rate & Overall Accuracy

train <- readRDS("data/30dayTrain_unimputed.Rda")
train <- select(train, c("physicalHealth.30day", "socialLifeDifficulty.30day", "healthWorkAbsence.30day", "disorder.30day"))
num <- 0 #Remove all rows with more "num" NAs. Set to 0 to do unimputed training
train <- train[(rowSums(is.na(train)) <= num),]
set.seed(103)
if(num != 0){
  train <- rfImpute(disorder.30day~., data = train)
}

testOne <- readRDS("data/30dayTestOne_unimputed.Rda")
testOne <- na.omit(select(testOne, c("physicalHealth.30day", "socialLifeDifficulty.30day", "healthWorkAbsence.30day", "disorder.30day")))

# Optimal num: 0 (Unimputed)
# Optimal mtry: 2
# OOB estimate of  error rate: 43%
# Accuracy: 0.843 
# Sensitivity: 0.475  
# Specificity: 0.857
#number of variables, 9 for allVars, 3 for goodVars
n <- 9

#Create balanced training set, where num endorsed ~ num not endorsed
train.endorsed <- train[train$disorder.30day == "1",]
train.notEndorsed <- train[train$disorder.30day == "0",]
prob <- nrow(train.endorsed)/nrow(train.notEndorsed)
set.seed(104)
ind <- sample(2, nrow(train.notEndorsed), replace = T, prob = c(prob, 1-prob))
train.notEndorsed <- train.notEndorsed[ind == 1,]
train.balanced <- rbind(train.notEndorsed, train.endorsed)

rf.error.p <- rep(0, n)
for (p in 1:n){
  rf.fit <- randomForest(disorder.30day~., data = train.balanced, ntree = 500, mtry = p)
  rf.error.p[p] <- rf.fit$err.rate[500]
}
plot(1:n, rf.error.p, pch=16,
     main = "OOB Error Rate for mtry With 500 Trees",
     xlab="mtry",
     ylab="OOB Error Rate")
lines(1:n, rf.error.p)

rf.error.p <- rep(0, n)
for (p in 1:n){
  rf.fit <- randomForest(disorder.30day~., data = train.balanced, ntree = 500, mtry = p)
  rf.fit.pred <- predict(rf.fit, testOne)
  rf.error.p[p] <- confusionMatrix(rf.fit.pred, testOne$disorder.30day)$byClass[1] #True Positive
}
plot(1:n, rf.error.p, pch=16,
     main = "TestOne True Positive Rate for mtry",
     xlab="mtry",
     ylab="True Positive Rate")
lines(1:n, rf.error.p)

rf.error.p <- rep(0, n)
for (p in 1:n){
  rf.fit <- randomForest(disorder.30day~., data = train.balanced, ntree = 500, mtry = p)
  rf.fit.pred <- predict(rf.fit, testOne)
  rf.error.p[p] <- confusionMatrix(rf.fit.pred, testOne$disorder.30day)$byClass[2] #True Negative
}
plot(1:n, rf.error.p, pch=16,
     main = "TestOne True Negative Rate for mtry",
     xlab="mtry",
     ylab="True Negative Rate")
lines(1:n, rf.error.p)

set.seed(105)
rf.fit <- randomForest(disorder.30day~., data = train.balanced, ntree = 500, mtry = 2, proximity = T) #Final Model
rf.fit
## 
## Call:
##  randomForest(formula = disorder.30day ~ ., data = train.balanced,      ntree = 500, mtry = 2, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 43%
## Confusion matrix:
##    1  0 class.error
## 1 69 81       0.540
## 0 46 99       0.317
varImpPlot(rf.fit, sort=T, n.var=min(10, nrow(rf.fit$importance)))

hist(treesize(rf.fit), main = "No. of nodes", col = "blue")

#CANNOT HANDLE NA VALUES FOR FEATURES IN TEST SETS
rf.fit.pred <- predict(rf.fit, testOne)
confusionMatrix(rf.fit.pred, testOne$disorder.30day)$overall[1] #Accuracy
## Accuracy 
##    0.843
confusionMatrix(rf.fit.pred, testOne$disorder.30day)$byClass[1] #True Positive
## Sensitivity 
##       0.475
confusionMatrix(rf.fit.pred, testOne$disorder.30day)$byClass[2] #True Negative
## Specificity 
##       0.857
MDSplot(rf.fit, testOne$disorder.30day)

The best model was our postimputed forest with mtry = 4, and number of NAs allowed in training set before imputation set to 3: MCE = 44.4%, Sensitivity = 0.667, Specificity = 0.625. Ranked by importance according to mean decrease gini, the most important variables were daysMobilityDifficult, jobPerformance, and daysOvertime. For important variables, this appears to contradict our single tree model.

LASSO

Like randomForest, our LASSO function cannot handle NA values in either the testing or the training sets. LASSO algorithms will trained on preimputed, postimputed, and unimputed training sets for both “allVars” and “goodVars”. The criterion for selecting optimal algorithm remains the same.

train <- readRDS("data/30dayTrain_imputed.Rda")
testOne <- na.omit(readRDS("data/30dayTestOne_unimputed.Rda"))

# Training Error: 0.27
# Testing Error: 0.152
# Testing Sensitivity: 0.333
# Testing Specificity: 0.884
#Optimal Threshold: 0.02
train <- readRDS("data/30dayTrain_unimputed.Rda")
#Remove all rows with more than "num" NAs. Set to 0 to do unimputed training
num <- 3
train <- train[(rowSums(is.na(train)) <= num),]
if(num != 0){
  set.seed(107)
  train <- rfImpute(disorder.30day~., data = train) #Impute missing values
}
## ntree      OOB      1      2
##   300:   9.71% 98.06%  1.05%
## ntree      OOB      1      2
##   300:   9.28%100.00%  0.38%
## ntree      OOB      1      2
##   300:   9.11% 99.03%  0.29%
## ntree      OOB      1      2
##   300:   9.11% 99.03%  0.29%
## ntree      OOB      1      2
##   300:   8.93% 99.03%  0.10%
testOne <- na.omit(readRDS("data/30dayTestOne_unimputed.Rda"))

# [1] "Training Error"
# [1] 0.401
# [1] "Testing Error"
# [1] 0.109
# [1] "Testing Sensitivity"
# [1] 0.667
# [1] "Testing Specificity"
# [1] 0.907
# Optimal num: 3
# Optimal threshold: 0.48
train <- readRDS("data/30dayTrain_imputed.Rda")
train <- select(train, c("physicalHealth.30day", "socialLifeDifficulty.30day", "healthWorkAbsence.30day", "disorder.30day"))

testOne <- readRDS("data/30dayTestOne_unimputed.Rda")
testOne <- na.omit(select(testOne, c("physicalHealth.30day", "socialLifeDifficulty.30day", "healthWorkAbsence.30day", "disorder.30day")))

# Training Error: 0.42
# Testing Error: 0.0466
# Testing Sensitivity: 0.167
# Testing Specificity: 0.966
# Optimal Threshold: 0.38
train <- readRDS("data/30dayTrain_unimputed.Rda")
train <- select(train, c("physicalHealth.30day", "socialLifeDifficulty.30day", "healthWorkAbsence.30day", "disorder.30day"))
num <- 2 #Remove all rows with more "num" NAs. Set to 0 to do unimputed training
train <- train[(rowSums(is.na(train)) <= num),]
set.seed(108)
if(num != 0){
  train <- rfImpute(disorder.30day~., data = train)
}
## ntree      OOB      1      2
##   300:   3.31%100.00%  0.00%
## ntree      OOB      1      2
##   300:   3.31%100.00%  0.00%
## ntree      OOB      1      2
##   300:   3.31%100.00%  0.00%
## ntree      OOB      1      2
##   300:   3.31%100.00%  0.00%
## ntree      OOB      1      2
##   300:   3.31%100.00%  0.00%
testOne <- readRDS("data/30dayTestOne_unimputed.Rda")
testOne <- na.omit(select(testOne, c("physicalHealth.30day", "socialLifeDifficulty.30day", "healthWorkAbsence.30day", "disorder.30day")))

# Optimal num: 0 (Unimputed)
# Training Error: 43%
# Testing Error: 0.0618 
# Sensitivity: 0.128  
# Specificity: 0.968
# Optimal Threshold: 0.38
#Pairwise plot of numeric features. Testing for collinearity. Active only for allVars
# pairs.panels(train[c(-1, -2, -3, -4, -5, -6, -10)])

#Form balanced training data
train.endorsed <- train[train$disorder.30day == "1",]
train.notEndorsed <- train[train$disorder.30day == "0",]
prob <- nrow(train.endorsed)/nrow(train.notEndorsed)
set.seed(109)
ind <- sample(2, nrow(train.notEndorsed), replace = T, prob = c(prob, 1-prob))
train.notEndorsed <- train.notEndorsed[ind == 1,]
train.balanced <- rbind(train.notEndorsed, train.endorsed)

# Specify vars - allVars: 1, goodVars: 4
varType <- 1 #######################################################################MODIFY!

#Isolate target
ytrain <- train.balanced[, varType]
#Isolate features
xtrain <- model.matrix(disorder.30day~., data = train.balanced)[, -varType]

#Try different lambda values
set.seed(110)
CV = cv.glmnet(x = xtrain, y = ytrain, family = "binomial", type.measure = "class", nfolds = 5, alpha = 1, nlambda = 100)
#Train lasso using previously identified optimal lambda
set.seed(111)
lasso.fit <- glmnet(x = xtrain, y = ytrain, family = "binomial", nfolds = 5, alpha = 1, lambda = CV$lambda.min)

#Plot Cross Validation for visualization
plot(CV)

lasso.fit.pred <- predict(lasso.fit, model.matrix(disorder.30day~., data = train.balanced)[, -1], type = "class")
predictions <- as.data.frame(lasso.fit.pred) %>% rename("predicted" = s0)
actual <- as.data.frame(train.balanced$disorder.30day) %>% rename("actual" = 'train.balanced$disorder.30day')
confusionMatrix <- as.data.frame(table(cbind(actual, predictions)))
oobError <- 1 - sum(confusionMatrix[confusionMatrix$actual == confusionMatrix$predicted, confusionMatrix$actual == confusionMatrix$predicted]$Freq) /
  sum(confusionMatrix$Freq)
###OOB ERROR
print("Training Error")
## [1] "Training Error"
oobError
## [1] 0.342
#Use lasso to predict target using features from testOne
toTest <- testOne ######################################################################################################MODIFY!
features <- model.matrix(disorder.30day~., data = toTest)[, -varType]
lasso.fit.pred <- predict(lasso.fit, features, type = "response")

#Visualize logistic regression probabilities
plot(lasso.fit.pred)

#Tuning
thresholdTests <- data.frame(matrix(0, nrow = 4, ncol = 100))
row.names(thresholdTests) <- c("threshold", "accuracy", "sensitivity", "specificity")

for(i in as.integer(100*min(lasso.fit.pred)+1):as.integer(100*max(lasso.fit.pred)-1))
{
  threshold <- i/100
  lasso.fit.testPred <- ifelse(lasso.fit.pred > threshold, "0", "1")
  predictions <- as.data.frame(lasso.fit.testPred) %>% rename("predicted" = s0)
  actual <- as.data.frame(toTest$disorder.30day) %>% rename("actual" = 'toTest$disorder.30day')
  confusionMatrix <- as.data.frame(table(cbind(actual, predictions)))
  thresholdTests[1,i] <- threshold
  thresholdTests[2,i] <- sum(confusionMatrix[confusionMatrix$actual == confusionMatrix$predicted, confusionMatrix$actual == confusionMatrix$predicted]$Freq) /
    sum(confusionMatrix$Freq)
  thresholdTests[3,i] <- confusionMatrix[confusionMatrix$actual == 1 & confusionMatrix$predicted == 1,]$Freq /
    sum(confusionMatrix[confusionMatrix$predicted == 1,]$Freq)
  thresholdTests[4,i] <- confusionMatrix[confusionMatrix$actual == 0 & confusionMatrix$predicted == 0,]$Freq /
    sum(confusionMatrix[confusionMatrix$predicted == 0,]$Freq)
}

length <- as.integer(100*max(lasso.fit.pred)-1) - as.integer(100*min(lasso.fit.pred)+1) + 1
thresholdVec <- rep(0, length)
accuracyVec <- rep(0, length)
sensitivityVec <- rep(0, length)
specificityVec <- rep(0, length)
for(i in 1:length)
{
  thresholdVec[i] <- thresholdTests[1,i+as.integer(100*min(lasso.fit.pred)+1)-2]
  accuracyVec[i] <- thresholdTests[2,i+as.integer(100*min(lasso.fit.pred)+1)-2]
  sensitivityVec[i] <- thresholdTests[3,i+as.integer(100*min(lasso.fit.pred)+1)-2]
  specificityVec[i] <- thresholdTests[4,i+as.integer(100*min(lasso.fit.pred)+1)-2]
}

threshold.dat <- data.frame("threshold" = thresholdVec, "accuracy" = accuracyVec, "sensitivity" = sensitivityVec, "specificity" = specificityVec)

thresholds.graph <- plot_ly(threshold.dat, x = ~threshold, y = ~accuracy, type = 'scatter', mode = 'lines', name = "accuracy") %>% 
  add_trace(x = ~threshold, y = ~sensitivity, type = 'scatter', mode = 'lines', name = "sensitivity") %>%
  add_trace(x = ~threshold, y = ~specificity, type = 'scatter', mode = 'lines', name = "specificity") %>%
  layout(title = "testOne accuracy measures vs threshold", xaxis = list(title = "threshold"), yaxis = list(title = "accuracy"))
thresholds.graph
#Note: Lasso appears to be calculating probability for not endorsed
lasso.fit.pred <- ifelse(lasso.fit.pred > 0.48, "0", "1") ##########################################################################MODIFY!
plot(lasso.fit.pred)

predictions <- as.data.frame(lasso.fit.pred) %>% rename("predicted" = s0)
actual <- as.data.frame(toTest$disorder.30day) %>% rename("actual" = 'toTest$disorder.30day')
confusionMatrix <- as.data.frame(table(cbind(actual, predictions)))
mce <- 1 - sum(confusionMatrix[confusionMatrix$actual == confusionMatrix$predicted, confusionMatrix$actual == confusionMatrix$predicted]$Freq) /
  sum(confusionMatrix$Freq)
sensitivity <- confusionMatrix[confusionMatrix$actual == 1 & confusionMatrix$predicted == 1,]$Freq / 
  sum(confusionMatrix[confusionMatrix$predicted == 1,]$Freq)
specificity <- confusionMatrix[confusionMatrix$actual == 0 & confusionMatrix$predicted == 0,]$Freq / 
  sum(confusionMatrix[confusionMatrix$predicted == 0,]$Freq)

print("Testing Error")
## [1] "Testing Error"
mce
## [1] 0.14
print("Testing Sensitivity")
## [1] "Testing Sensitivity"
sensitivity
## [1] 0.0972
print("Testing Specificity")
## [1] "Testing Specificity"
specificity
## [1] 0.973
lasso.fit$beta
## 4 x 1 sparse Matrix of class "dgCMatrix"
##                                        s0
## physicalHealth.30dayWorse           .    
## physicalHealth.30dayAbout The Same  .    
## socialLifeDifficulty.30dayNo        0.395
## healthWorkAbsence.30day            -0.020

The optimal algorithm was our LASSO trained on a allVars postimputed training set, where initial NAs allowed per row was set to 3 and threshold was set to 0.48: MCE = 0.109, Sensitivity = 0.667, Specificity = 0.907. difficultyUnderstandingSituation, socialLifeDifficulty, and healthWorkAbsence had non-zero coefficients in the model

boosting

deep learning

k nearest neighbor

naive bayes

Variable Documentation (TODO)

Sources: https://www.icpsr.umich.edu/web/ICPSR/studies/20240/summary (Publicly Available Dataset Download) https://www.icpsr.umich.edu/web/DSDR/search/variables?q=yescount&STUDYQ=20240 (Variable Documentation)

 

Alexander Chen

With A Very Special Thanks To Prof. Linda Zhao